# --- Imports and notebook setup -------------------------------------------
# Data handling, plotting, statistics and model-selection utilities for the
# vehicle-silhouette classification analysis below.
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
#importing seaborn for statistical plots
import seaborn as sns
sns.set(color_codes = True)
# To enable plotting graphs in Jupyter notebook
# NOTE(review): the %matplotlib line is IPython magic — this file only runs
# inside a notebook/IPython session, not as a plain Python script.
%matplotlib inline
import scipy.stats as stats
import statsmodels.api as statm
from sklearn.model_selection import train_test_split
import warnings
# Suppress ALL warnings for cleaner notebook output; note this also hides
# deprecation warnings, so take care when upgrading libraries.
warnings.filterwarnings('ignore')
from scipy.stats import zscore
# Load the vehicle silhouette dataset (expected in the working directory).
vehicle = pd.read_csv("vehicle.csv")
# First/last rows — display-only output in a notebook cell.
vehicle.head()
vehicle.tail()
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
columns = vehicle.columns
#Let's Label Encode our class variable:
print(columns)
# LabelEncoder assigns integer codes in sorted (alphabetical) order of the
# class names; the fitted encoder stays available in `le`.
vehicle['class'] = le.fit_transform(vehicle['class'])
# Quick structural overview: shape, dtypes and summary statistics.
vehicle.shape
vehicle.info()
vehicle.dtypes
vehicle.describe().T
Observation:
Compactness has mean and median values that are almost equal, which signifies that it is normally distributed and has no skewness or outliers.
circularity: it also seems to be normally distributed, as its mean and median have similar values.
scatter_ratio seems to have some skewness and outliers.
# --- Missing-value inspection and median imputation ------------------------
# Per-column counts of missing vs present values.  (Top-level
# pd.value_counts is deprecated since pandas 2.0; call value_counts on each
# Series instead.)
vehicle.isna().apply(lambda col: col.value_counts())  # checking missing value
from sklearn.impute import SimpleImputer
newdf = vehicle.copy()
# First 19 columns: the 18 measurements plus the already-encoded 'class'
# column, which the modelling steps further below still need to find in newdf.
X = newdf.iloc[:, 0:19]
# NOTE: SimpleImputer's `verbose` parameter was deprecated in scikit-learn
# 1.1 and removed in 1.3, so it is no longer passed here.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# Fill missing values with the MEDIAN of each column (robust to the
# outliers observed in the EDA notes).
transformed_values = imputer.fit_transform(X)
column = X.columns
print(column)
# Rebuild a DataFrame from the imputed ndarray, restoring column names.
newdf = pd.DataFrame(transformed_values, columns = column )
newdf.describe().T
# Sanity check: nulls before vs after imputation.
print("Original null value count:", vehicle.isnull().sum())
print("\n\nCount after we imputed the NaN value: ", newdf.isnull().sum())
# Histograms of every column for a first look at the distributions.
newdf.hist(bins=20, figsize=(60,40), color='lightblue', edgecolor = 'red')
plt.show()
Observations :
Most of the data attributes seem to be normally distributed.
scaled_variance.1, skewness_about.1 and skewness_about.2, and scatter_ratio seem to be right skewed.
pr.axis_rectangularity seems to have outliers, as there are some gaps in its bar plot.
# Distribution plots for the attributes flagged as skewed in the EDA notes.
# sns.distplot is deprecated (and removed in recent seaborn); histplot with
# kde plus an explicit rugplot is the modern equivalent of
# distplot(kde=True, rug=True).
plt.figure(figsize = (15,10))
skewed_cols = ['scaled_variance.1', 'scaled_variance', 'skewness_about.1',
               'skewness_about', 'scatter_ratio']
for pos, col in enumerate(skewed_cols, start=1):
    plt.subplot(5, 4, pos)
    sns.histplot(newdf[col], kde=True, stat='density', color="coral")
    sns.rugplot(newdf[col], color="coral")
plt.show()
# Numeric skewness per column (NaNs skipped) to back up the visual read.
newdf.skew(axis = 0, skipna = True)
# Overview boxplot of every attribute on one horizontal axis, followed by
# individual boxplots for the attributes suspected of carrying outliers.
plt.figure(figsize = (15,10))
ax = sns.boxplot(data=newdf, orient="h")
plt.figure(figsize= (15,10))
suspect_cols = [
    ('pr.axis_aspect_ratio', 'cyan'),
    ('skewness_about', 'hotpink'),
    ('scaled_variance', 'yellow'),
    ('radius_ratio', 'teal'),
    ('scaled_radius_of_gyration.1', 'lightblue'),
    ('scaled_variance.1', 'lavender'),
    ('max.length_aspect_ratio', 'lightgrey'),
    ('skewness_about.1', 'pink'),
]
for pos, (col, shade) in enumerate(suspect_cols, start=1):
    plt.subplot(5, 3, pos)
    sns.boxplot(x=newdf[col], color=shade)
plt.show()
All of the above boxplots show outliers, visible as the individual dotted points.
from scipy.stats import iqr
# IQR-based outlier removal using Tukey fences (1.5 * IQR beyond Q1/Q3).
# NOTE: 'class' is the label-encoded target, not a measurement, so the
# outlier rule is computed on the feature columns only; the filtered frame
# still keeps every column (including 'class') for the modelling steps below.
feature_cols = newdf.columns.drop('class')
Q1 = newdf[feature_cols].quantile(0.25)
Q3 = newdf[feature_cols].quantile(0.75)
IQR = Q3 - Q1
print(IQR)
# Keep only rows where NO feature falls outside its Tukey fences.
outlier_mask = ((newdf[feature_cols] < (Q1 - 1.5 * IQR)) |
                (newdf[feature_cols] > (Q3 + 1.5 * IQR))).any(axis=1)
cleandf = newdf[~outlier_mask]
cleandf.shape
# Re-draw the same per-attribute boxplots on the cleaned frame to verify
# that the Tukey-fence filtering removed the flagged outliers.
plt.figure(figsize= (15,10))
checked_cols = [
    ('pr.axis_aspect_ratio', 'cyan'),
    ('skewness_about', 'hotpink'),
    ('scaled_variance', 'yellow'),
    ('radius_ratio', 'teal'),
    ('scaled_radius_of_gyration.1', 'lightblue'),
    ('scaled_variance.1', 'lavender'),
    ('max.length_aspect_ratio', 'lightgrey'),
    ('skewness_about.1', 'pink'),
]
for pos, (col, shade) in enumerate(checked_cols, start=1):
    plt.subplot(5, 3, pos)
    sns.boxplot(x=cleandf[col], color=shade)
plt.show()
We can see from the boxplots that all the attributes which had outliers have been treated and the outliers removed. Since the number of outliers was small we opted to remove them. Generally we avoid this, as it can lead to information loss in the case of large datasets with a large number of outliers.
# Correlation structure of the independent attributes only.  The target
# 'class' is dropped first because PCA (further below) must be performed on
# the independent attributes alone.
corr_df = newdf.drop(columns='class')
corr = corr_df.corr()
# Annotated heatmap of the pairwise Pearson correlations.
plt.figure(figsize=(16, 10))
sns.heatmap(corr, annot=True, cmap="YlGnBu")
plt.show()
Strong Correlation:
Scaled Variance & Scaled Variance.1 seem to be strongly correlated, with a value of 0.98
skewness_about.2 and hollows_ratio seem to be strongly correlated, corr coeff: 0.89
distance_circularity and radius_ratio seem to have a high positive correlation, corr coeff: 0.81
compactness & circularity, and radius_ratio & pr.axis_aspect_ratio, also seem moderately correlated, with coeff: 0.67.
scaled_variance and scaled_radius_of_gyration, and circularity & distance_circularity, also seem to be highly correlated, with corr coeff: 0.79
pr.axis_rectangularity and max.length_rectangularity also seem to be strongly correlated, with coeff: 0.81
scatter_ratio and elongatedness seem to have a strong negative correlation, val: -0.97
elongatedness and pr.axis_rectangularity seem to have a strong negative correlation, val: -0.95
Weak/No Correlation:
max.length_aspect_ratio & radius_ratio have an average correlation, with coeff: 0.5
pr.axis_aspect_ratio & max.length_aspect_ratio seem to have very little correlation
scaled_radius_of_gyration & scaled_radius_of_gyration.1 seem to be very weakly correlated
scaled_radius_of_gyration.1 & skewness_about seem to be very weakly correlated
skewness_about & skewness_about.1 appear not to be correlated
skewness_about.1 and skewness_about.2 are not correlated.
# Pairwise scatter matrix with KDE diagonals to inspect the same
# correlations visually (slow for 18 attributes).
sns.pairplot(corr_df, diag_kind="kde")
We found from our pairplot analysis that Scaled Variance & Scaled Variance.1, and elongatedness and pr.axis_rectangularity, are strongly correlated. They need to be treated carefully before we go for model building.
Our aim is to recognize whether an object is a van, bus or car based on some input features. So our main assumption is that there is little or no multicollinearity between the features. If two features are highly correlated then there is no point in using both features.
From the above correlation matrix we can see that there are many features with a correlation of more than 0.9. We can get rid of those columns with a correlation of +/-0.9 or above. There are 8 such columns.
We can pick one of two highly correlated variables and drop the other. For example, Scaled Variance & Scaled Variance.1 have a strong positive correlation, so we can keep one and drop the other, as together they only make our dimensions redundant.
Similarly, between elongatedness and pr.axis_rectangularity we can pick one, as they have a very strong negative correlation.
# Display the class balance: how many rows are car / bus / van
# (as label-encoded integer codes).
print(cleandf['class'].value_counts())
# seaborn >= 0.12 requires the data to be passed by keyword; a positional
# Series argument to countplot raises a TypeError there.
sns.countplot(x=cleandf['class'])
plt.show()
# Separate the independent attributes (columns 0..17) from the encoded
# target 'class' (column 18), then standardise the features so PCA is not
# dominated by attributes with a large numeric scale.
from sklearn.preprocessing import StandardScaler
X = cleandf.iloc[:,0:18].values
y = cleandf.iloc[:,18].values
scaler = StandardScaler()
XScaled = scaler.fit_transform(X)  # z-scored copy of the features
# --- Eigen-decomposition of the feature covariance matrix (manual PCA) -----
cov_matrix = np.cov(XScaled.T)  # covariance of the scaled features
print("Covariance Matrix shape:",cov_matrix.shape)
print("Covariance Matrix\n", cov_matrix)
# Eigenvalues / eigenvectors of the covariance matrix.  (np.linalg.eigh
# would also work here since the matrix is symmetric.)
e_vals, e_vecs = np.linalg.eig(cov_matrix)
print('Eigenvectors \n%s' %e_vecs)
print('\nEigenvalues \n%s' %e_vals)
eigen_pairs = [(np.abs(e_vals[index]), e_vecs[:,index]) for index in range(len(e_vals))]
# Sort the (eigenvalue, eigenvector) pairs from highest to lowest
# eigenvalue.  An explicit key is required: a plain tuple sort would fall
# through to comparing the eigenvector ARRAYS on eigenvalue ties, which
# raises ValueError for numpy arrays.
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
eigen_pairs[:18]
# Extract the descending ordered eigenvalues and eigenvectors.
eigvalues_sorted = [pair[0] for pair in eigen_pairs]
eigvectors_sorted = [pair[1] for pair in eigen_pairs]
# Confirm the sort worked by printing the eigenvalues.
print('Eigenvalues in descending order: \n%s' %eigvalues_sorted)
# Scree plot: per-component explained variance (as % of total) and its
# cumulative sum, used to pick how many components to keep.
tot = sum(e_vals)
var_exp = [(val / tot) * 100 for val in sorted(e_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
components = range(1, e_vals.size + 1)
plt.figure(figsize=(10 , 5))
plt.bar(components, var_exp, alpha = 0.5, align = 'center',
        label = 'Individual explained variance')
plt.step(components, cum_var_exp, where='mid',
         label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
From the above plot we can see that 8 dimensions are able to explain 95% of the variance of the data. So we will use the first 8 principal components going forward and calculate the reduced dimensions.
Now 8 dimensions seems very reasonable: with 8 variables we can explain over 95% of the variation in the original data.
# Project the standardised data onto the top-8 principal components
# (18 original dimensions reduced to 8).
projection = np.array(eigvectors_sorted[0:8])      # 8 x 18 projection matrix
XScaled_pca = np.dot(XScaled, projection.T)        # component scores per row
# Wrap the scores in a DataFrame (needed for the pairplot and the splits
# further below); `dim_reduce` is the reduced-dimension dataset from here on.
dim_reduce = pd.DataFrame(XScaled_pca)
dim_reduce
# Pairplot of the components — after PCA they should show no correlation.
sns.pairplot(dim_reduce, diag_kind='kde')
It is clearly visible from the pairplot above that after dimensionality reduction using PCA, the attributes have become independent, with no correlation among themselves: most of them show a cloud of data points with no linear kind of relationship.
We will use 70% of the data for training and 30% for testing.
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state=1 pins the shuffling
# so both experiments are reproducible.
# Split on the original (scaled) features:
X_train, X_test, y_train, y_test = train_test_split(
    XScaled, y, test_size = 0.30, random_state = 1)
# Split on the 8 principal-component scores:
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(
    dim_reduce, y, test_size=0.30, random_state = 1)
# Baseline SVM (default RBF kernel and hyper-parameters) trained on the
# original scaled features.
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
clf = SVC()
clf.fit(X_train, y_train)
# Predictions and mean accuracy on the held-out set.
y_pred = clf.predict(X_test)
print ('Before PCA score', clf.score(X_test, y_test))
# Per-class precision / recall / f1 breakdown.
print(classification_report(y_test, y_pred))
# Confusion matrix for the original-feature model.
cm = metrics.confusion_matrix(y_test, y_pred)
# LabelEncoder assigns codes in ALPHABETICAL order of the class names, so
# the confusion-matrix axes must use le.classes_ (presumably
# bus/car/van) — the previous hard-coded ['van','car','bus'] ordering
# mislabelled the axes.
class_names = list(le.classes_)
df_cm = pd.DataFrame(cm,
                     index = class_names,
                     columns = class_names)
plt.figure(figsize=(9,6))
sns.heatmap(df_cm, annot=True, cmap='YlGnBu', fmt='g')
plt.title('Accuracy:{0:.3f}'.format(accuracy_score(y_test, y_pred)))
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
An insight we can get from the matrix is that the SVC model was very weak at classifying 'van' (True Positives/All = 0.10); however, accuracy for 'car' (93/121 = 0.77) and 'bus' (39/45 = 0.87) was slightly better.
# Re-fit the same SVC on the PCA-reduced training data and evaluate it on
# the PCA-reduced test split.
clf.fit(pca_X_train, pca_y_train)
pca_y_pred = clf.predict(pca_X_test)
pca_score = clf.score(pca_X_test, pca_y_test)
print ('After PCA score', pca_score)
# Per-class precision / recall / f1 breakdown for the PCA model.
print(classification_report(pca_y_test, pca_y_pred))
# Confusion matrix for the PCA-based model.
cm = metrics.confusion_matrix(pca_y_test, pca_y_pred)
# Use le.classes_ for the axis labels: LabelEncoder codes the classes in
# alphabetical order, so the previous hard-coded ['van','car','bus']
# ordering mislabelled the axes.
class_names = list(le.classes_)
df_cm = pd.DataFrame(cm,
                     index = class_names,
                     columns = class_names)
plt.figure(figsize=(9,6))
sns.heatmap(df_cm, annot=True, cmap='YlGnBu', fmt='g')
plt.title('Accuracy:{0:.3f}'.format(accuracy_score(pca_y_test, pca_y_pred)))
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
An insight we can get from the matrix is that the SVC model after PCA was very accurate at classifying 'van' (True Positives/All = 1.0); however, accuracy for 'car' (116/121 = 0.96) and 'bus' (41/45 = 0.91) was lower.
On the given dataset we trained models with both the original and the dimensionally reduced data.
- For the SVM model, we got 98% accuracy with the original data.
- With PCA, we got 96% accuracy. The effects of PCA can be clearly appreciated on a dataset.
Dimensionality reduction plays a really important role in machine learning, especially when you are working with a large number of features. Principal Components Analysis is one of the top dimensionality reduction algorithms and is easy to understand.
The original dataset was composed of 18 features x 846 rows. After applying Principal Components Analysis, I discovered that only 8 principal components were enough to keep a 96% accuracy score!